Create figures for the manuscript, extended abstract and poster.
library(data.table)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(igraph)
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
library(ggplot2)
require(ggrepel)
## Loading required package: ggrepel
require(magrittr)
## Loading required package: magrittr
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:magrittr':
##
## extract
## The following object is masked from 'package:igraph':
##
## crossing
set.seed(1510)
options(bitmapType='cairo')
`%ni%` <- Negate(`%in%`)
Un-normalized data:
input_dir = '/home/j.aguirreplans/Projects/Scipher/SampleSize/scripts/SampleSizeShiny/data'
plots_dir = '/home/j.aguirreplans/Projects/Scipher/SampleSize/data/out/plots'
numbers_file = paste(input_dir, 'dataset_numbers_complete_graph.txt', sep='/')
numbers_df = fread(numbers_file)
numbers_df$dataset = tolower(numbers_df$dataset)
input_dir = '/home/j.aguirreplans/Projects/Scipher/SampleSize/scripts/SampleSizeShiny/data/example_pearson_pval_0.05'
topology_results_file = paste(input_dir, 'topology_results_pearson_pval_0.05.txt', sep='/')
results_selected_df = fread(topology_results_file)
topology_results_by_size_file = paste(input_dir, 'topology_results_mean_pearson_pval_0.05.txt', sep='/')
topology_results_selected_by_size_df = fread(topology_results_by_size_file)
predictions_file = paste(input_dir, 'predictions_pearson_pval_0.05.txt', sep='/')
predicted_results_df = fread(predictions_file)
analytical_results_file = paste(input_dir, 'analytical_model_results_pearson_pval_0.05.txt', sep='/')
topology_results_selected_analytical_df = fread(analytical_results_file)
analytical_summary_file = paste(input_dir, 'analytical_model_summary_pearson_pval_0.05.txt', sep='/')
analytical_model_summary_df = fread(analytical_summary_file)
analytical_regression_results_file = paste(input_dir, 'analytical_model_regression_results_pearson_pval_0.05.txt', sep='/')
stretched_exponential_regression_df = fread(analytical_regression_results_file)
theoretical_sample_size_file = paste(input_dir, 'theoretical_sample_size_for_correlations_of_datasets.txt', sep='/')
sample_size_correlation_df = fread(theoretical_sample_size_file)
Normalized data:
topology_type_normalization = "divide.L"
input_dir = '/home/j.aguirreplans/Projects/Scipher/SampleSize/scripts/SampleSizeShiny/data/example_pearson_pval_0.05'
topology_results_file = paste(input_dir, '/', 'topology_results_norm_', topology_type_normalization, '_pearson_pval_0.05.txt', sep='')
results_selected_norm_df = fread(topology_results_file)
topology_results_by_size_file = paste(input_dir, '/', 'topology_results_mean_norm_', topology_type_normalization, '_pearson_pval_0.05.txt', sep='')
topology_results_selected_by_size_norm_df = fread(topology_results_by_size_file)
predictions_file = paste(input_dir, '/', 'predictions_', topology_type_normalization, '_pearson_pval_0.05.txt', sep='')
predicted_results_norm_df = fread(predictions_file)
analytical_results_file = paste(input_dir, '/', 'analytical_model_results_', topology_type_normalization, '_pearson_pval_0.05.txt', sep='')
topology_results_selected_analytical_norm_df = fread(analytical_results_file)
Number of networks and maximum sample size:
nrow((results_selected_df %>% dplyr::select(method, type_dataset, size, rep) %>% unique()))
## [1] 2584
max(((results_selected_df %>% dplyr::select(method, type_dataset, size, rep) %>% unique()))$size)
## [1] 1100
Number of networks and maximum sample size without counting TCGA full dataset:
nrow((results_selected_df %>% dplyr::select(method, type_dataset, size, rep) %>% filter(!(type_dataset == "tcga:tcga")) %>% unique()))
## [1] 2584
max(((results_selected_df %>% dplyr::select(method, type_dataset, size, rep) %>% filter(!(type_dataset == "tcga:tcga")) %>% unique()))$size)
## [1] 1100
#datasets_selected = c("gtex:whole.blood", "gtex:artery.tibial", "tcga:tcga-coad", "tcga:tcga-brca")
datasets_selected = c("gtex:whole.blood", "gtex:artery.tibial", "tcga:tcga-coad", "tcga:tcga-brca", "scipher:scipher.complete.dataset")
results_selected_norm_df %>%
filter(type_dataset %in% datasets_selected) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "scipher:scipher.complete.dataset", "R. arthritis")) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "gtex:whole.blood", "GTEx: Whole blood")) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "gtex:artery.tibial", "GTEx: Artery tibial")) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "tcga:tcga-coad", "TCGA: Colon cancer")) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "tcga:tcga-brca", "TCGA: Breast cancer")) %>%
ggplot(aes(x=size, y=unnorm, col=type_dataset)) +
geom_point(alpha=0.5, size=3) +
#scale_x_continuous(trans = scales::log10_trans()) +
scale_color_manual(values = c("#D55E00", "#E69F00", "#44AA99", "#0072B2", "#56B4E9")) +
theme_linedraw() +
xlab("Num. samples") +
#xlab("log(N)") +
ylab("Num. significant correlations") +
guides(col=guide_legend(title="Dataset")) +
theme(plot.title = element_text(size = 17, face="bold"), axis.title = element_text(size = 16, face="bold"), axis.text = element_text(size = 15), legend.text = element_text(size = 14), legend.title=element_text(size=15, face="bold"))
plot_file = paste(plots_dir, "curve_pearson_pval_0.05.png", sep="/")
ggsave(
plot_file,
dpi = 1200,
width = 10000,
height = 6000,
units = c("px")
)
#datasets_selected = c("gtex:whole.blood", "gtex:artery.tibial", "tcga:tcga-coad", "tcga:tcga-brca")
datasets_selected = c("gtex:whole.blood", "gtex:artery.tibial", "tcga:tcga-coad", "tcga:tcga-brca", "scipher:scipher.complete.dataset")
#model_selected = "Stretched exponential (by optimization)"
model_selected = "Stretched exponential (by linear fit)"
stretched_exponential_regression_df %>% inner_join((topology_results_selected_analytical_df %>% dplyr::select(type_dataset, model, slope, intercept, a, b) %>% unique()), by=c("type_dataset", "model")) %>%
filter((type_dataset %in% datasets_selected) & (model == model_selected)) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "scipher:scipher.complete.dataset", "R. arthritis")) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "gtex:whole.blood", "GTEx: Whole blood")) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "gtex:artery.tibial", "GTEx: Artery tibial")) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "tcga:tcga-coad", "TCGA: Colon cancer")) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "tcga:tcga-brca", "TCGA: Breast cancer")) %>%
ggplot(aes(x=x, y=y, col=type_dataset)) +
geom_point(alpha=0.5, size=3) +
geom_abline(aes(intercept=intercept, slope=slope, col=type_dataset), size=1) +
theme_linedraw() +
xlab("log(N)") +
ylab("log[ log(L) - log(S) ]") +
scale_color_manual(values = c("#D55E00", "#E69F00", "#44AA99", "#0072B2", "#56B4E9")) +
guides(col=guide_legend(title="Dataset")) +
theme(plot.title = element_text(size = 17, face="bold"), axis.title = element_text(size = 16, face="bold"), axis.text = element_text(size = 15), legend.text = element_text(size = 14), legend.title=element_text(size=15, face="bold"))
plot_file = paste(plots_dir, "regression_pearson_pval_0.05.png", sep="/")
ggsave(
plot_file,
dpi = 1200,
width = 9500,
height = 6000,
units = c("px")
)
#datasets_selected = c("gtex:whole.blood", "gtex:artery.tibial", "tcga:tcga-coad", "tcga:tcga-brca")
datasets_selected = c("gtex:whole.blood", "gtex:artery.tibial", "tcga:tcga-coad", "tcga:tcga-brca", "scipher:scipher.complete.dataset")
#model_selected = "Stretched exponential (by optimization)"
model_selected = "Stretched exponential (by linear fit)"
analytical_model_summary_df$L_vs_total = abs((analytical_model_summary_df$unnorm_L - analytical_model_summary_df$total_num_edges)) / analytical_model_summary_df$total_num_edges
analytical_model_summary_df %>%
dplyr::select(type_dataset, model, a, b, L, L_vs_total, adj.r.squared, relative.error.mean) %>%
filter((type_dataset %in% datasets_selected) & (model == model_selected)) %>% unique()
## type_dataset model
## 1: scipher:scipher.complete.dataset Stretched exponential (by linear fit)
## 2: gtex:artery.tibial Stretched exponential (by linear fit)
## 3: gtex:whole.blood Stretched exponential (by linear fit)
## 4: tcga:tcga-brca Stretched exponential (by linear fit)
## 5: tcga:tcga-coad Stretched exponential (by linear fit)
## a b L L_vs_total adj.r.squared relative.error.mean
## 1: 1.790429 42.98998 1.462534 0 0.9868786 0.13920918
## 2: 1.927336 106.91242 1.228832 0 0.9436667 0.15898866
## 3: 1.650687 21.33857 1.586486 0 0.9832295 0.16228112
## 4: 1.486656 18.55161 3.565799 0 0.9975507 0.06784906
## 5: 1.809127 59.17297 1.770780 0 0.9924464 0.08729841
#datasets_selected = c("gtex:whole.blood", "gtex:artery.tibial", "tcga:tcga-coad", "tcga:tcga-brca")
datasets_selected = c("gtex:whole.blood", "gtex:artery.tibial", "tcga:tcga-coad", "tcga:tcga-brca", "scipher:scipher.complete.dataset")
#model_selected = "Stretched exponential (by optimization)"
model_selected = "Stretched exponential (by linear fit)"
results_selected_norm_df %>% right_join((predicted_results_norm_df %>% dplyr::select(type_dataset, model, model_result, size) %>% unique()), by=c("type_dataset", "size")) %>%
filter((type_dataset %in% datasets_selected) & (model == model_selected)) %>%
#results_selected_norm_df %>%
# filter(type_dataset %in% datasets_selected) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "scipher:scipher.complete.dataset", "R. arthritis")) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "gtex:whole.blood", "GTEx: Whole blood")) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "gtex:artery.tibial", "GTEx: Artery tibial")) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "tcga:tcga-coad", "TCGA: Colon cancer")) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "tcga:tcga-brca", "TCGA: Breast cancer")) %>%
ggplot(aes(x=size, y=num_edges, col=type_dataset)) +
geom_point(alpha=0.5, size=3) +
#geom_line(data=(predicted_results_norm_df %>% filter((type_dataset %in% datasets_selected) & (model == model_selected)) %>% unique()), aes(x=size, y=model_result, col=type_dataset), size=1) +
geom_line(aes(x=size, y=model_result, col=type_dataset), size=1) +
scale_x_continuous(trans = scales::log10_trans()) +
scale_color_manual(values = c("#D55E00", "#E69F00", "#44AA99", "#0072B2", "#56B4E9")) +
theme_linedraw() +
xlab("Num. samples (Log)") +
ylab("Fraction of significant correlations") +
guides(col=guide_legend(title="Dataset")) +
theme(plot.title = element_text(size = 17, face="bold"), axis.title = element_text(size = 16, face="bold"), axis.text = element_text(size = 15), legend.text = element_text(size = 14), legend.title=element_text(size=15, face="bold"))
## Warning: Removed 24825 rows containing missing values (geom_point).
plot_file = paste(plots_dir, "prediction_pearson_pval_0.05.png", sep="/")
ggsave(
plot_file,
dpi = 1200,
width = 9500,
height = 6000,
units = c("px")
)
## Warning: Removed 24825 rows containing missing values (geom_point).
models_selected = c("Stretched exponential (by linear fit)", "Logarithmic", "Mean")
dataset_selected = "gtex:whole.blood"
# Join predicted results with mean
predicted_results_norm_and_mean_df = rbind((topology_results_selected_by_size_norm_df %>% dplyr::select(type_dataset, size, mean_norm) %>% unique() %>% rename("model_result" = "mean_norm") %>% mutate(model="Mean")), (predicted_results_norm_df %>% dplyr::select(type_dataset, size, model_result, model) %>% unique())) %>%
filter(model %in% models_selected) %>%
mutate(model = replace(model, model == "Stretched exponential (by linear fit)", "Stretched exp.")) %>%
mutate(model = replace(model, model == "Logarithmic", "Logarithm"))
predicted_results_norm_and_mean_df$model <- factor(predicted_results_norm_and_mean_df$model, levels = c("Mean", "Stretched exp.", "Logarithm"))
# Process and plot
results_selected_norm_df %>%
right_join(predicted_results_norm_and_mean_df, by=c("type_dataset", "size")) %>%
filter(type_dataset == dataset_selected) %>%
filter(size <= max((results_selected_norm_df %>% filter(type_dataset == dataset_selected))$size)) %>%
filter(size >= min((results_selected_norm_df %>% filter(type_dataset == dataset_selected))$size)) %>%
ggplot(aes(x=size, y=num_edges, col=model)) +
geom_point(alpha=0.5, size=3, col="#91faad") +
geom_line(aes(x=size, y=model_result, col=model), size=1) +
scale_x_continuous(trans = scales::log10_trans()) +
scale_color_manual(values = c("#00f549", "#02b0f5", "#fc9403")) +
theme_linedraw() +
xlab("Num. samples (Log)") +
ylab("Fraction of significant correlations") +
#guides(col=guide_legend(title="Model")) +
theme(plot.title = element_text(size = 17, face="bold"), axis.title = element_text(size = 16, face="bold"), axis.text = element_text(size = 15), legend.text = element_text(size = 14), legend.title=element_blank(), legend.position="bottom")
## Warning: Removed 72 rows containing missing values (geom_point).
plot_file = paste(plots_dir, "comparison_models_pearson_pval_0.05.png", sep="/")
ggsave(
plot_file,
dpi = 1200,
width = 6600,
height = 6700,
units = c("px")
)
## Warning: Removed 72 rows containing missing values (geom_point).
#' calculate_predictions_using_stretched_exponential_model_optimized
#' Formula to calculate exponential decay of significant interactions from a list of sample sizes.
#' This formula requires the use of the L parameter
#' @param x List of sample sizes.
#' @param a Slope coefficient.
#' @param b Intercept coefficient.
#'
calculate_predictions_using_stretched_exponential_model_optimized = function(x, L, a, b){
y = L * exp((b*x**(-a+1))/(-a+1))
#y = exp(log(L) - exp(b+a*log(x)))
return(y)
}
#' calculate_prediction_from_analytical_model
#' Function to calculate predictions using a specific analytical model
#' @param model Name of the model used.
#' @param x_list List of values in the x axis (e.g. size)
#' @param a Parameter a.
#' @param b Parameter b.
#' @param L Parameter L.
#'
calculate_prediction_from_analytical_model = function(model, x_list, a, b, L){
if(model == "Logarithmic"){
prediction_result = (log(x_list)*a + b)
} else if((model == "Stretched exponential (by optimization)") | (model == "Stretched exponential (by linear fit)")){
prediction_result = calculate_predictions_using_stretched_exponential_model_optimized(x=x_list, L=L, a=a, b=b)
} else if(model == "Stretched exponential (without L)"){
prediction_result = calculate_predictions_using_stretched_exponential_model_without_L(x=x_list, a=a, b=b)
}
return(prediction_result)
}
#datasets_selected = c("gtex:whole.blood", "gtex:artery.tibial", "tcga:tcga-coad", "tcga:tcga-brca")
datasets_selected = c("gtex:whole.blood", "gtex:artery.tibial", "tcga:tcga-coad", "tcga:tcga-brca", "scipher:scipher.complete.dataset")
#model_selected = "Stretched exponential (by optimization)"
model_selected = "Stretched exponential (by linear fit)"
sample_size_vs_a_df = analytical_model_summary_df %>% inner_join(sample_size_correlation_df, by=c("type_dataset"="dataset")) %>%
#filter((type_dataset %in% datasets_selected) & (model == model_selected))
filter(model == model_selected)
# Calculate number of edges for each critical correlation using model
sample_size_vs_a_df$num_edges_from_statistical_corrected = calculate_prediction_from_analytical_model(model=model_selected, x_list=sample_size_vs_a_df$sample_size_statistical_corrected, a=sample_size_vs_a_df$a, b=sample_size_vs_a_df$b, L=sample_size_vs_a_df$L)
sample_size_vs_a_df$num_edges_from_statistical_corrected = sample_size_vs_a_df$num_edges_from_statistical_corrected * sample_size_vs_a_df$max_value_in_dataset
sample_size_vs_a_df$num_edges_from_statistical_corrected_norm = sample_size_vs_a_df$num_edges_from_statistical_corrected / sample_size_vs_a_df$total_num_edges
type_correlation_selected = "weak"
sample_size_vs_a_df %>%
filter(type_correlation == type_correlation_selected) %>%
arrange(desc(a))
## type_dataset model
## 1: gtex:artery.tibial Stretched exponential (by linear fit)
## 2: gtex:skin.sun.exposed.lower.leg Stretched exponential (by linear fit)
## 3: tcga:tcga-coad Stretched exponential (by linear fit)
## 4: gtex:thyroid Stretched exponential (by linear fit)
## 5: gtex:muscle.skeletal Stretched exponential (by linear fit)
## 6: tcga:tcga-luad Stretched exponential (by linear fit)
## 7: tcga:tcga-ucec Stretched exponential (by linear fit)
## 8: gtex:lung Stretched exponential (by linear fit)
## 9: gtex:whole.blood Stretched exponential (by linear fit)
## 10: tcga:tcga-thca Stretched exponential (by linear fit)
## 11: tcga:tcga-kirc Stretched exponential (by linear fit)
## 12: tcga:tcga-lusc Stretched exponential (by linear fit)
## 13: tcga:tcga-lgg Stretched exponential (by linear fit)
## 14: tcga:tcga-skcm Stretched exponential (by linear fit)
## 15: tcga:tcga-brca Stretched exponential (by linear fit)
## a b L max_value_in_dataset
## 1: 1.927336 106.91242 1.228832 111967462
## 2: 1.836919 69.54141 1.443081 109822205
## 3: 1.809127 59.17297 1.770780 88426012
## 4: 1.746114 46.00874 1.669286 95645379
## 5: 1.722551 41.77794 1.598193 79087483
## 6: 1.656657 39.41186 2.709381 68040939
## 7: 1.654527 32.50631 2.274983 69647418
## 8: 1.654318 28.07262 1.964261 82744314
## 9: 1.650687 21.33857 1.586486 72312862
## 10: 1.592448 22.92988 2.677422 66112661
## 11: 1.575560 24.71561 3.405391 54688392
## 12: 1.543983 25.78059 4.921295 38777887
## 13: 1.528879 19.73056 3.959177 47356290
## 14: 1.503901 20.60499 6.322108 25885542
## 15: 1.486656 18.55161 3.565799 51204938
## formula adj.r.squared
## 1: F(x) = 1.23 * exp[(106.91 * x**((1.93) + 1)) / ((1.93) + 1) ] 0.9436667
## 2: F(x) = 1.44 * exp[(69.54 * x**((1.84) + 1)) / ((1.84) + 1) ] 0.9921006
## 3: F(x) = 1.77 * exp[(59.17 * x**((1.81) + 1)) / ((1.81) + 1) ] 0.9924464
## 4: F(x) = 1.67 * exp[(46.01 * x**((1.75) + 1)) / ((1.75) + 1) ] 0.9898823
## 5: F(x) = 1.6 * exp[(41.78 * x**((1.72) + 1)) / ((1.72) + 1) ] 0.9944041
## 6: F(x) = 2.71 * exp[(39.41 * x**((1.66) + 1)) / ((1.66) + 1) ] 0.9974881
## 7: F(x) = 2.27 * exp[(32.51 * x**((1.65) + 1)) / ((1.65) + 1) ] 0.9949493
## 8: F(x) = 1.96 * exp[(28.07 * x**((1.65) + 1)) / ((1.65) + 1) ] 0.9838666
## 9: F(x) = 1.59 * exp[(21.34 * x**((1.65) + 1)) / ((1.65) + 1) ] 0.9832295
## 10: F(x) = 2.68 * exp[(22.93 * x**((1.59) + 1)) / ((1.59) + 1) ] 0.9960270
## 11: F(x) = 3.41 * exp[(24.72 * x**((1.58) + 1)) / ((1.58) + 1) ] 0.9944937
## 12: F(x) = 4.92 * exp[(25.78 * x**((1.54) + 1)) / ((1.54) + 1) ] 0.9875920
## 13: F(x) = 3.96 * exp[(19.73 * x**((1.53) + 1)) / ((1.53) + 1) ] 0.9979693
## 14: F(x) = 6.32 * exp[(20.6 * x**((1.5) + 1)) / ((1.5) + 1) ] 0.9782098
## 15: F(x) = 3.57 * exp[(18.55 * x**((1.49) + 1)) / ((1.49) + 1) ] 0.9975507
## relative.error.mean unnorm_L total_num_edges density L_vs_total
## 1: 0.15898866 137589166 137589166 1 0
## 2: 0.13088726 158482306 158482306 1 0
## 3: 0.08729841 156583056 156583056 1 0
## 4: 0.14868418 159659515 159659515 1 0
## 5: 0.07910128 126397050 126397050 1 0
## 6: 0.07100511 184348801 184348801 1 0
## 7: 0.08393989 158446701 158446701 1 0
## 8: 0.28316122 162531435 162531435 1 0
## 9: 0.16228112 114723378 114723378 1 0
## 10: 0.15280302 177011520 177011520 1 0
## 11: 0.11524896 186235350 186235350 1 0
## 12: 0.15592063 190837416 190837416 1 0
## 13: 0.07097702 187491930 187491930 1 0
## 14: 0.21145193 163651186 163651186 1 0
## 15: 0.06784906 182586495 182586495 1 0
## type_correlation correlation sample_size_statistical
## 1: weak 0.2 68.77302
## 2: weak 0.2 68.77302
## 3: weak 0.2 68.77302
## 4: weak 0.2 68.77302
## 5: weak 0.2 68.77302
## 6: weak 0.2 68.77302
## 7: weak 0.2 68.77302
## 8: weak 0.2 68.77302
## 9: weak 0.2 68.77302
## 10: weak 0.2 68.77302
## 11: weak 0.2 68.77302
## 12: weak 0.2 68.77302
## 13: weak 0.2 68.77302
## 14: weak 0.2 68.77302
## 15: weak 0.2 68.77302
## sample_size_statistical_corrected num_edges_from_statistical_corrected
## 1: 911.1627 111794287
## 2: 916.9078 120302266
## 3: 939.7179 117463600
## 4: 920.8103 109314656
## 5: 894.8027 82560413
## 6: 946.9803 94655796
## 7: 939.6221 90275648
## 8: 921.3918 99277579
## 9: 886.0931 77194741
## 10: 945.7492 90762327
## 11: 947.8885 81129977
## 12: 948.9429 61156192
## 13: 947.5851 69369551
## 14: 941.3899 44707084
## 15: 946.9359 46983418
## num_edges_from_statistical_corrected_norm
## 1: 0.8125225
## 2: 0.7590896
## 3: 0.7501680
## 4: 0.6846736
## 5: 0.6531831
## 6: 0.5134603
## 7: 0.5697540
## 8: 0.6108208
## 9: 0.6728772
## 10: 0.5127481
## 11: 0.4356315
## 12: 0.3204623
## 13: 0.3699869
## 14: 0.2731852
## 15: 0.2573214
type_correlation_selected = "weak"
#sample_size_vs_a_df$data_col = ifelse(sample_size_vs_a_df$type_dataset == "gtex:whole.blood", "#E69F00", ifelse(sample_size_vs_a_df$type_dataset == "gtex:artery.tibial", "#D55E00", ifelse(sample_size_vs_a_df$type_dataset == "tcga:tcga-coad", "#56B4E9", ifelse(sample_size_vs_a_df$type_dataset == "tcga:tcga-brca", "#0072B2", "black"))))
sample_size_vs_a_df %>%
filter(type_correlation == type_correlation_selected) %>%
filter(!(type_dataset == "tcga:tcga")) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "scipher:scipher.complete.dataset", "R. arthritis")) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "gtex:whole.blood", "GTEx: Whole blood")) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "gtex:artery.tibial", "GTEx: Artery tibial")) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "tcga:tcga-coad", "TCGA: Colon cancer")) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "tcga:tcga-brca", "TCGA: Breast cancer")) %>%
ggplot(aes(x=a, y=num_edges_from_statistical_corrected_norm)) +
geom_point() +
geom_point(data = . %>% filter(type_dataset == "TCGA: Colon cancer"), color = "#56B4E9") +
geom_point(data = . %>% filter(type_dataset == "TCGA: Breast cancer"), color = "#0072B2") +
geom_point(data = . %>% filter(type_dataset == "R. arthritis"), color = "#44AA99") +
geom_point(data = . %>% filter(type_dataset == "GTEx: Whole blood"), color = "#E69F00") +
geom_point(data = . %>% filter(type_dataset == "GTEx: Artery tibial"), color = "#D55E00") +
xlab(expression(alpha)) +
ylab("Fraction of correlations above 0.2") +
#guides(col=guide_legend(title="Dataset")) +
theme_linedraw() +
theme(plot.title = element_text(size = 17, face="bold"), axis.title = element_text(size = 16, face="bold"), axis.text = element_text(size = 15), legend.text = element_text(size = 14), legend.title=element_text(size=15, face="bold")) +
geom_text_repel(
data = subset((sample_size_vs_a_df %>% filter(type_correlation == type_correlation_selected)), type_dataset == "gtex:whole.blood"),
aes(label = type_dataset), col = "#E69F00",
size = 5,
box.padding = unit(0.5, "lines"),
point.padding = unit(0.3, "lines")
) +
geom_text_repel(
data = subset((sample_size_vs_a_df %>% filter(type_correlation == type_correlation_selected)), type_dataset == "gtex:artery.tibial"),
aes(label = type_dataset), col = "#D55E00",
size = 5,
box.padding = unit(0.35, "lines"),
point.padding = unit(0.3, "lines")
) +
geom_text_repel(
data = subset((sample_size_vs_a_df %>% filter(type_correlation == type_correlation_selected) %>% mutate(type_dataset = replace(type_dataset, type_dataset == "scipher:scipher.complete.dataset", "R. arthritis"))), type_dataset == "R. arthritis"),
aes(label = type_dataset), col = "#44AA99",
size = 5,
box.padding = unit(0.5, "lines"),
point.padding = unit(0.3, "lines")
) +
geom_text_repel(
data = subset((sample_size_vs_a_df %>% filter(type_correlation == type_correlation_selected)), type_dataset == "tcga:tcga-brca"),
aes(label = type_dataset), col = "#0072B2",
size = 5,
box.padding = unit(0.35, "lines"),
point.padding = unit(0.3, "lines")
) +
geom_text_repel(
data = subset((sample_size_vs_a_df %>% filter(type_correlation == type_correlation_selected)), type_dataset == "tcga:tcga-coad"),
aes(label = type_dataset), col = "#56B4E9",
size = 5,
box.padding = unit(0.5, "lines"),
point.padding = unit(0.3, "lines")
)
plot_file = paste(plots_dir, "a_vs_fraction_sig_correlations_pearson_pval_0.05.png", sep="/")
ggsave(
plot_file,
dpi = 1200,
width = 6400,
height = 6000,
units = c("px")
)
type_correlation_selected = "weak"
#sample_size_vs_a_df$data_col = ifelse(sample_size_vs_a_df$type_dataset == "gtex:whole.blood", "#E69F00", ifelse(sample_size_vs_a_df$type_dataset == "gtex:artery.tibial", "#D55E00", ifelse(sample_size_vs_a_df$type_dataset == "tcga:tcga-coad", "#56B4E9", ifelse(sample_size_vs_a_df$type_dataset == "tcga:tcga-brca", "#0072B2", "black"))))
sample_size_vs_a_df %>%
filter(type_correlation == type_correlation_selected) %>%
filter(!(type_dataset == "tcga:tcga")) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "scipher:scipher.complete.dataset", "R. arthritis")) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "gtex:whole.blood", "GTEx: Whole blood")) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "gtex:artery.tibial", "GTEx: Artery tibial")) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "tcga:tcga-coad", "TCGA: Colon cancer")) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "tcga:tcga-brca", "TCGA: Breast cancer")) %>%
ggplot(aes(x=a, y=num_edges_from_statistical_corrected_norm)) +
geom_point() +
geom_point(data = . %>% filter(type_dataset == "TCGA: Colon cancer"), color = "#56B4E9") +
geom_point(data = . %>% filter(type_dataset == "TCGA: Breast cancer"), color = "#0072B2") +
geom_point(data = . %>% filter(type_dataset == "R. arthritis"), color = "#44AA99") +
geom_point(data = . %>% filter(type_dataset == "GTEx: Whole blood"), color = "#E69F00") +
geom_point(data = . %>% filter(type_dataset == "GTEx: Artery tibial"), color = "#D55E00") +
xlab(expression(alpha)) +
ylab("Fraction of correlations above 0.2") +
#guides(col=guide_legend(title="Dataset")) +
theme_linedraw() +
theme(plot.title = element_text(size = 17, face="bold"), axis.title = element_text(size = 16, face="bold"), axis.text = element_text(size = 15), legend.text = element_text(size = 14), legend.title=element_text(size=15, face="bold"))
plot_file = paste(plots_dir, "a_vs_fraction_sig_correlations_pearson_pval_0.05_no_labels.png", sep="/")
ggsave(
plot_file,
dpi = 1200,
width = 6400,
height = 6000,
units = c("px")
)
#(sample_size_vs_a_df %>% select("type_dataset", "a") %>% unique()) %>% inner_join((results_df %>% select("type_dataset", "size") %>% unique()), by=c("type_dataset")) %>%
# group_by(type_dataset, a) %>%
# summarize(max_size = max(size)) %>%
# ggplot(aes(x=a, y=max_size)) +
# geom_point()
Define function to calculate critical sample size for correlation:
calculate_sample_size_for_pearson_correlation_using_t_distribution_with_optimization = function(r, total_num_edges, alpha=0.05, N_guess=c(10000)){
optimize_N = function(par){
N = par[1]
t_guess = qt(alpha, df=N-2, lower.tail = F)
t = r * sqrt((N-2)) / sqrt((1-r**2))
return((abs(t-t_guess)))
}
res = optim(par=N_guess, fn=optimize_N, method="Brent", lower=3, upper=50000)
return(res$par)
}
calculate_predictions_using_stretched_exponential_model_optimized = function(x, L, a, b){
y = L * exp((b*x**(-a+1))/(-a+1))
#y = exp(log(L) - exp(b+a*log(x)))
return(y)
}
Calculate sample size for different levels of correlation and different datasets:
type_correlation_df = data.frame(type_correlation=c("weak", "moderate", "strong", "very strong"), lower_val=c(0.2,0.4,0.6,0.8), upper_val=c(0.4,0.6,0.8,NA))
types_correlation = c("weak", "moderate", "strong", "very strong")
cols = c("dataset", "type_correlation", "correlation", "sample_size_statistical", "sample_size_statistical_corrected")
sample_size_correlation_df = data.frame(matrix(ncol=length(cols),nrow=0, dimnames=list(NULL, cols)))
for (dataset in numbers_df$dataset){
total_num_edges = (numbers_df %>% filter(dataset == !!dataset))$total_num_edges
for (type_correlation in types_correlation){
r = (type_correlation_df %>% filter(type_correlation == !!type_correlation))$lower_val
N = calculate_sample_size_for_pearson_correlation_using_t_distribution_with_optimization(r=r, total_num_edges=total_num_edges, alpha=0.05, N_guess=c(10000))
N_corrected = calculate_sample_size_for_pearson_correlation_using_t_distribution_with_optimization(r=r, total_num_edges=total_num_edges, alpha=0.05/total_num_edges, N_guess=c(10000))
sample_size_correlation_df <- rbind(sample_size_correlation_df, data.frame(dataset=dataset, type_correlation=type_correlation, correlation=r, sample_size_statistical=N, sample_size_statistical_corrected=N_corrected))
}
}
print(sample_size_correlation_df)
## dataset type_correlation correlation
## 1 scipher:scipher.complete.dataset weak 0.2
## 2 scipher:scipher.complete.dataset moderate 0.4
## 3 scipher:scipher.complete.dataset strong 0.6
## 4 scipher:scipher.complete.dataset very strong 0.8
## 5 scipher:scipher.complete.nonresponder weak 0.2
## 6 scipher:scipher.complete.nonresponder moderate 0.4
## 7 scipher:scipher.complete.nonresponder strong 0.6
## 8 scipher:scipher.complete.nonresponder very strong 0.8
## 9 scipher:scipher.complete.responder weak 0.2
## 10 scipher:scipher.complete.responder moderate 0.4
## 11 scipher:scipher.complete.responder strong 0.6
## 12 scipher:scipher.complete.responder very strong 0.8
## 13 scipher:scipher.sample.per.patient.baseline weak 0.2
## 14 scipher:scipher.sample.per.patient.baseline moderate 0.4
## 15 scipher:scipher.sample.per.patient.baseline strong 0.6
## 16 scipher:scipher.sample.per.patient.baseline very strong 0.8
## 17 tcga:tcga-brca weak 0.2
## 18 tcga:tcga-brca moderate 0.4
## 19 tcga:tcga-brca strong 0.6
## 20 tcga:tcga-brca very strong 0.8
## 21 tcga:tcga-brca-normal weak 0.2
## 22 tcga:tcga-brca-normal moderate 0.4
## 23 tcga:tcga-brca-normal strong 0.6
## 24 tcga:tcga-brca-normal very strong 0.8
## 25 tcga:tcga-coad weak 0.2
## 26 tcga:tcga-coad moderate 0.4
## 27 tcga:tcga-coad strong 0.6
## 28 tcga:tcga-coad very strong 0.8
## 29 tcga:tcga-coad-normal weak 0.2
## 30 tcga:tcga-coad-normal moderate 0.4
## 31 tcga:tcga-coad-normal strong 0.6
## 32 tcga:tcga-coad-normal very strong 0.8
## 33 tcga:tcga-kirc weak 0.2
## 34 tcga:tcga-kirc moderate 0.4
## 35 tcga:tcga-kirc strong 0.6
## 36 tcga:tcga-kirc very strong 0.8
## 37 tcga:tcga-kirc-normal weak 0.2
## 38 tcga:tcga-kirc-normal moderate 0.4
## 39 tcga:tcga-kirc-normal strong 0.6
## 40 tcga:tcga-kirc-normal very strong 0.8
## 41 tcga:tcga-lgg weak 0.2
## 42 tcga:tcga-lgg moderate 0.4
## 43 tcga:tcga-lgg strong 0.6
## 44 tcga:tcga-lgg very strong 0.8
## 45 tcga:tcga-luad weak 0.2
## 46 tcga:tcga-luad moderate 0.4
## 47 tcga:tcga-luad strong 0.6
## 48 tcga:tcga-luad very strong 0.8
## 49 tcga:tcga-luad-normal weak 0.2
## 50 tcga:tcga-luad-normal moderate 0.4
## 51 tcga:tcga-luad-normal strong 0.6
## 52 tcga:tcga-luad-normal very strong 0.8
## 53 tcga:tcga-lusc weak 0.2
## 54 tcga:tcga-lusc moderate 0.4
## 55 tcga:tcga-lusc strong 0.6
## 56 tcga:tcga-lusc very strong 0.8
## 57 tcga:tcga-lusc-normal weak 0.2
## 58 tcga:tcga-lusc-normal moderate 0.4
## 59 tcga:tcga-lusc-normal strong 0.6
## 60 tcga:tcga-lusc-normal very strong 0.8
## 61 tcga:tcga-skcm weak 0.2
## 62 tcga:tcga-skcm moderate 0.4
## 63 tcga:tcga-skcm strong 0.6
## 64 tcga:tcga-skcm very strong 0.8
## 65 tcga:tcga-thca weak 0.2
## 66 tcga:tcga-thca moderate 0.4
## 67 tcga:tcga-thca strong 0.6
## 68 tcga:tcga-thca very strong 0.8
## 69 tcga:tcga-thca-normal weak 0.2
## 70 tcga:tcga-thca-normal moderate 0.4
## 71 tcga:tcga-thca-normal strong 0.6
## 72 tcga:tcga-thca-normal very strong 0.8
## 73 tcga:tcga-ucec weak 0.2
## 74 tcga:tcga-ucec moderate 0.4
## 75 tcga:tcga-ucec strong 0.6
## 76 tcga:tcga-ucec very strong 0.8
## 77 tcga:tcga-ucec-normal weak 0.2
## 78 tcga:tcga-ucec-normal moderate 0.4
## 79 tcga:tcga-ucec-normal strong 0.6
## 80 tcga:tcga-ucec-normal very strong 0.8
## 81 gtex:artery.tibial weak 0.2
## 82 gtex:artery.tibial moderate 0.4
## 83 gtex:artery.tibial strong 0.6
## 84 gtex:artery.tibial very strong 0.8
## 85 gtex:breast.mammary.tissue weak 0.2
## 86 gtex:breast.mammary.tissue moderate 0.4
## 87 gtex:breast.mammary.tissue strong 0.6
## 88 gtex:breast.mammary.tissue very strong 0.8
## 89 gtex:lung weak 0.2
## 90 gtex:lung moderate 0.4
## 91 gtex:lung strong 0.6
## 92 gtex:lung very strong 0.8
## 93 gtex:muscle.skeletal weak 0.2
## 94 gtex:muscle.skeletal moderate 0.4
## 95 gtex:muscle.skeletal strong 0.6
## 96 gtex:muscle.skeletal very strong 0.8
## 97 gtex:skin.sun.exposed.lower.leg weak 0.2
## 98 gtex:skin.sun.exposed.lower.leg moderate 0.4
## 99 gtex:skin.sun.exposed.lower.leg strong 0.6
## 100 gtex:skin.sun.exposed.lower.leg very strong 0.8
## 101 gtex:thyroid weak 0.2
## 102 gtex:thyroid moderate 0.4
## 103 gtex:thyroid strong 0.6
## 104 gtex:thyroid very strong 0.8
## 105 gtex:whole.blood weak 0.2
## 106 gtex:whole.blood moderate 0.4
## 107 gtex:whole.blood strong 0.6
## 108 gtex:whole.blood very strong 0.8
## sample_size_statistical sample_size_statistical_corrected
## 1 68.773025 914.63989
## 2 18.002295 216.05522
## 3 8.523611 85.91378
## 4 5.063346 38.90078
## 5 68.773025 915.15007
## 6 18.002295 216.17467
## 7 8.523611 85.96045
## 8 5.063346 38.92117
## 9 68.773025 914.79186
## 10 18.002295 216.09080
## 11 8.523611 85.92768
## 12 5.063346 38.90685
## 13 68.773025 914.18952
## 14 18.002295 215.94977
## 15 8.523611 85.87258
## 16 5.063346 38.88278
## 17 68.773025 945.60421
## 18 18.002295 223.30503
## 19 8.523611 88.74614
## 20 5.063346 40.13813
## 21 68.773025 948.92354
## 22 18.002295 224.08220
## 23 8.523611 89.04977
## 24 5.063346 40.27077
## 25 68.773025 938.25966
## 26 18.002295 221.58542
## 27 8.523611 88.07432
## 28 5.063346 39.84464
## 29 68.773025 944.05564
## 30 18.002295 222.94246
## 31 8.523611 88.60449
## 32 5.063346 40.07625
## 33 68.773025 946.55023
## 34 18.002295 223.52652
## 35 8.523611 88.83268
## 36 5.063346 40.17593
## 37 68.773025 946.85197
## 38 18.002295 223.59717
## 39 8.523611 88.86028
## 40 5.063346 40.18799
## 41 68.773025 946.87172
## 42 18.002295 223.60180
## 43 8.523611 88.86208
## 44 5.063346 40.18878
## 45 68.773025 946.06344
## 46 18.002295 223.41255
## 47 8.523611 88.78815
## 48 5.063346 40.15648
## 49 68.773025 942.86340
## 50 18.002295 222.66332
## 51 8.523611 88.49544
## 52 5.063346 40.02860
## 53 68.773025 947.71732
## 54 18.002295 223.79978
## 55 8.523611 88.93943
## 56 5.063346 40.22257
## 57 68.773025 948.72516
## 58 18.002295 224.03575
## 59 8.523611 89.03162
## 60 5.063346 40.26284
## 61 68.773025 940.37012
## 62 18.002295 222.07955
## 63 8.523611 88.26737
## 64 5.063346 39.92897
## 65 68.773025 944.12173
## 66 18.002295 222.95793
## 67 8.523611 88.61054
## 68 5.063346 40.07889
## 69 68.773025 947.49190
## 70 18.002295 223.74700
## 71 8.523611 88.91881
## 72 5.063346 40.21356
## 73 68.773025 938.82521
## 74 18.002295 221.71783
## 75 8.523611 88.12605
## 76 5.063346 39.86724
## 77 68.773025 942.98176
## 78 18.002295 222.69102
## 79 8.523611 88.50626
## 80 5.063346 40.03333
## 81 68.773025 932.07888
## 82 18.002295 220.13829
## 83 8.523611 87.50896
## 84 5.063346 39.59765
## 85 68.773025 938.69085
## 86 18.002295 221.68638
## 87 8.523611 88.11376
## 88 5.063346 39.86187
## 89 68.773025 940.04191
## 90 18.002295 222.00271
## 91 8.523611 88.23735
## 92 5.063346 39.91586
## 93 68.773025 928.02415
## 94 18.002295 219.18894
## 95 8.523611 87.13806
## 96 5.063346 39.43562
## 97 68.773025 938.83594
## 98 18.002295 221.72035
## 99 8.523611 88.12704
## 100 5.063346 39.86767
## 101 68.773025 939.18970
## 102 18.002295 221.80317
## 103 8.523611 88.15940
## 104 5.063346 39.88180
## 105 68.773025 923.39357
## 106 18.002295 218.10476
## 107 8.523611 86.71449
## 108 5.063346 39.25058
dataset_selected = "gtex:whole.blood"
model_selected = "Stretched exponential (by linear fit)"
results_analytical_gtex_wb = topology_results_selected_analytical_norm_df %>%
filter((model == model_selected) & (type_dataset == dataset_selected)) %>%
select(model, type_dataset, a, b, L, max_value_in_dataset) %>% unique()
sample_size_correlation_gtex_wb = sample_size_correlation_df %>%
filter((dataset == dataset_selected) & (!(type_correlation == "weak")))
predictions_convergence_gtex_wb_norm = calculate_predictions_using_stretched_exponential_model_optimized(x=sample_size_correlation_gtex_wb$sample_size_statistical_corrected, L=results_analytical_gtex_wb$L, a=results_analytical_gtex_wb$a, b=results_analytical_gtex_wb$b)
predictions_convergence_gtex_wb_df = data.frame(size=round(sample_size_correlation_gtex_wb$sample_size_statistical_corrected), num_edges=predictions_convergence_gtex_wb_norm * results_analytical_gtex_wb$max_value_in_dataset, num_edges_norm=predictions_convergence_gtex_wb_norm)
predictions_convergence_gtex_wb_df$num_edges_norm = (predictions_convergence_gtex_wb_df$num_edges/results_analytical_gtex_wb$max_value_in_dataset)/results_analytical_gtex_wb$L
predictions_convergence_gtex_wb_df
## size num_edges num_edges_norm
## 1 218 42782199 0.37291614
## 2 87 19008527 0.16569009
## 3 39 5649757 0.04924678
predicted_results_wb_df = predicted_results_df %>% filter((model == model_selected) & (type_dataset == dataset_selected))
predicted_results_wb_df$model_result_norm = (predicted_results_wb_df$model_result/results_analytical_gtex_wb$max_value_in_dataset)/results_analytical_gtex_wb$L
dataset_selected = "Whole.Blood"
all_topology_results_file = '/home/j.aguirreplans/Projects/Scipher/SampleSize/scripts/SampleSizeShiny/data/analysis_topology.csv'
threshold_selected = 0.05
all_results_df = fread(all_topology_results_file)
results_gtex_wb = all_results_df %>%
filter((type_dataset == dataset_selected) & (threshold == threshold_selected) & (type_correlation %in% c("weak-moderate-strong-very strong", "moderate-strong-very strong", "strong-very strong", "very strong")))
results_gtex_wb$num_edges_norm = (results_gtex_wb$num_edges/results_analytical_gtex_wb$max_value_in_dataset)/results_analytical_gtex_wb$L
results_gtex_wb %>%
mutate(type_correlation = replace(type_correlation, type_correlation == "weak-moderate-strong-very strong", "\u2265 0.2")) %>%
mutate(type_correlation = replace(type_correlation, type_correlation == "moderate-strong-very strong", "\u2265 0.4")) %>%
mutate(type_correlation = replace(type_correlation, type_correlation == "strong-very strong", "\u2265 0.6")) %>%
mutate(type_correlation = replace(type_correlation, type_correlation == "very strong", "\u2265 0.8")) %>%
ggplot(aes(x=size, y=num_edges_norm, col=type_correlation)) +
geom_point(alpha=0.5, size=3) +
geom_line(data=(predicted_results_wb_df %>% filter(size <= max((all_results_df %>% filter((type_dataset == dataset_selected) & (threshold == threshold_selected)))$size))), aes(x=size, y=model_result_norm), size=1, color='red') +
geom_point(data=predictions_convergence_gtex_wb_df,
aes(x=size,y=num_edges_norm),
color='red',
size=3) +
#scale_x_continuous(trans = scales::log10_trans()) +
#scale_color_manual(values = c("#D55E00", "#E69F00", "#44AA99", "#0072B2", "#56B4E9")) +
theme_linedraw() +
xlab("Number of samples") +
#xlab("log(N)") +
ylab("Fraction of significant correlations") +
guides(col=guide_legend(title="Correlation level")) +
theme(plot.title = element_text(size = 17, face="bold"), axis.title = element_text(size = 16, face="bold"), axis.text = element_text(size = 15), legend.text = element_text(size = 14), legend.title=element_text(size=15, face="bold"))
plot_file = paste(plots_dir, "correlation_levels_pearson_pval_0.05_data_gtexwholeblood.png", sep="/")
ggsave(
plot_file,
dpi = 1200,
width = 9000,
height = 6000,
units = c("px")
)
all_disease_genes_results_file = '/home/j.aguirreplans/Projects/Scipher/SampleSize/scripts/SampleSizeShiny/data/analysis_disease_genes.csv'
threshold_selected = 0.05
dataset_selected = "scipher:scipher.complete.dataset"
all_disease_genes_results_df = fread(all_disease_genes_results_file)
all_disease_genes_results_df$type_dataset = paste(all_disease_genes_results_df$dataset, all_disease_genes_results_df$type_dataset, sep=":") # Join dataset and type_dataset
all_disease_genes_results_df$type_dataset = tolower(all_disease_genes_results_df$type_dataset)
results_scipher_df = all_disease_genes_results_df %>%
filter((type_dataset == dataset_selected) & (threshold == threshold_selected))
head(results_scipher_df)
## method dataset type_dataset type_tcga_tissue size rep
## 1: pearson scipher scipher:scipher.complete.dataset 100 1
## 2: pearson scipher scipher:scipher.complete.dataset 100 1
## 3: pearson scipher scipher:scipher.complete.dataset 100 1
## 4: pearson scipher scipher:scipher.complete.dataset 100 1
## 5: pearson scipher scipher:scipher.complete.dataset 100 1
## 6: pearson scipher scipher:scipher.complete.dataset 100 1
## type_correlation threshold disease
## 1: all 0.05 alzheimer disease
## 2: all 0.05 alzheimer disease
## 3: all 0.05 heart failure
## 4: all 0.05 diabetes mellitus type 2
## 5: all 0.05 diabetes mellitus type 2
## 6: all 0.05 asthma
## disease_class num_disease_genes num_disease_edges
## 1: mental disorders 226 5637
## 2: nervous system diseases 226 5637
## 3: cardiovascular diseases 104 1021
## 4: endocrine system diseases 168 3361
## 5: nutritional and metabolic diseases 168 3361
## 6: immune system diseases 279 9087
## fraction_disease_genes num_disease_components num_disease_lcc_nodes
## 1: 0.8828125 21 224
## 2: 0.8828125 21 224
## 3: 0.8524590 13 104
## 4: 0.8795812 14 168
## 5: 0.8795812 14 168
## 6: 0.8971061 23 279
## num_disease_lcc_edges fraction_disease_lcc_nodes disease_lcc_z
## 1: 5636 0.8750000 3.808188
## 2: 5636 0.8750000 3.808188
## 3: 1021 0.8524590 3.011924
## 4: 3361 0.8795812 3.695956
## 5: 3361 0.8795812 3.695956
## 6: 9087 0.8971061 4.232955
## disease_lcc_pvalue
## 1: 0.000000e+00
## 2: 0.000000e+00
## 3: 7.076959e-07
## 4: 0.000000e+00
## 5: 0.000000e+00
## 6: 0.000000e+00
#cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
diseases_selected = c("arthritis rheumatoid", "asthma", "breast neoplasms", "heart failure")
results_scipher_df %>%
filter((disease %in% diseases_selected) & (type_correlation == "moderate-strong-very strong")) %>%
group_by(size, disease) %>%
summarize(mean_pval = mean(disease_lcc_pvalue)) %>%
ggplot(aes(x=size, y=abs(log10(mean_pval)), col=disease)) +
#geom_point(alpha=0.5, size=3) +
geom_line(size=1) +
geom_hline(yintercept=as.numeric(abs(log10(0.05))), linetype="dashed", color="red", size=0.5) +
#scale_x_continuous(trans = scales::log10_trans()) +
#scale_color_manual(values = cbbPalette) +
theme_linedraw() +
xlab("Number of samples") +
#xlab("log(N)") +
ylab("Disease LCC log p-value (abs)") +
guides(col=guide_legend(title="Disease")) +
theme(plot.title = element_text(size = 17, face="bold"),
axis.title = element_text(size = 16, face="bold"),
axis.text = element_text(size = 15),
#legend.position = "bottom",
legend.text = element_text(size = 14),
legend.title=element_text(size=15, face="bold"))
## `summarise()` has grouped output by 'size'. You can override using the `.groups`
## argument.
plot_file = paste(plots_dir, "disease_module_significance_by_diseases_pearson_pval_0.05_data_sciphercomplete.png", sep="/")
ggsave(
plot_file,
dpi = 1200,
width = 9700,
height = 6000,
units = c("px")
)
results_scipher_df %>%
filter(disease == "arthritis rheumatoid") %>%
filter((type_dataset == dataset_selected) & (threshold == threshold_selected) & (type_correlation %in% c("weak-moderate-strong-very strong", "moderate-strong-very strong", "strong-very strong", "very strong"))) %>%
mutate(type_correlation = replace(type_correlation, type_correlation == "weak-moderate-strong-very strong", "\u2265 0.2")) %>%
mutate(type_correlation = replace(type_correlation, type_correlation == "moderate-strong-very strong", "\u2265 0.4")) %>%
mutate(type_correlation = replace(type_correlation, type_correlation == "strong-very strong", "\u2265 0.6")) %>%
mutate(type_correlation = replace(type_correlation, type_correlation == "very strong", "\u2265 0.8")) %>%
group_by(size, type_correlation) %>%
mutate(mean_pval = mean(disease_lcc_pvalue)) %>%
ungroup() %>%
ggplot() +
geom_point(aes(x=size, y=abs(log10(disease_lcc_pvalue)), col=type_correlation), alpha=0.4, size=3) +
geom_line(aes(x=size, y=abs(log10(mean_pval)), col=type_correlation), size=1) +
geom_hline(yintercept=as.numeric(abs(log10(0.05))), linetype="dashed", color="red", size=0.5) +
#scale_x_continuous(trans = scales::log10_trans()) +
#scale_color_manual(values = c("#D55E00", "#E69F00", "#44AA99", "#0072B2", "#56B4E9")) +
theme_linedraw() +
xlab("Number of samples") +
#xlab("log(N)") +
ylab("Disease LCC log p-value (abs)") +
guides(col=guide_legend(title="Correlation level")) +
theme(plot.title = element_text(size = 17, face="bold"), axis.title = element_text(size = 16, face="bold"), axis.text = element_text(size = 15), legend.text = element_text(size = 14), legend.title=element_text(size=15, face="bold"))
plot_file = paste(plots_dir, "disease_module_significance_by_correlations_pearson_pval_0.05_data_sciphercomplete.png", sep="/")
ggsave(
plot_file,
dpi = 1200,
width = 9000,
height = 6000,
units = c("px")
)
results_scipher_df %>%
filter(disease == "arthritis rheumatoid") %>%
filter((type_dataset == dataset_selected) & (threshold == threshold_selected) & (type_correlation %in% c("weak-moderate-strong-very strong", "moderate-strong-very strong", "strong-very strong", "very strong"))) %>%
mutate(type_correlation = replace(type_correlation, type_correlation == "weak-moderate-strong-very strong", "\u2265 0.2")) %>%
mutate(type_correlation = replace(type_correlation, type_correlation == "moderate-strong-very strong", "\u2265 0.4")) %>%
mutate(type_correlation = replace(type_correlation, type_correlation == "strong-very strong", "\u2265 0.6")) %>%
mutate(type_correlation = replace(type_correlation, type_correlation == "very strong", "\u2265 0.8")) %>%
group_by(size, type_correlation) %>%
mutate(mean_rlcc = mean(fraction_disease_lcc_nodes)) %>%
ungroup() %>%
ggplot() +
geom_point(aes(x=size, y=fraction_disease_lcc_nodes, col=type_correlation), alpha=0.4, size=3) +
geom_line(aes(x=size, y=mean_rlcc, col=type_correlation), size=1) +
#scale_x_continuous(trans = scales::log10_trans()) +
#scale_color_manual(values = c("#D55E00", "#E69F00", "#44AA99", "#0072B2", "#56B4E9")) +
theme_linedraw() +
xlab("Number of samples") +
#xlab("log(N)") +
ylab("Fraction of disease genes in the LCC") +
guides(col=guide_legend(title="Correlation level")) +
theme(plot.title = element_text(size = 17, face="bold"), axis.title = element_text(size = 16, face="bold"), axis.text = element_text(size = 15), legend.text = element_text(size = 14), legend.title=element_text(size=15, face="bold"))
plot_file = paste(plots_dir, "disease_module_rlcc_by_correlations_pearson_pval_0.05_data_sciphercomplete.png", sep="/")
ggsave(
plot_file,
dpi = 1200,
width = 9300,
height = 6000,
units = c("px")
)
datasets_selected = c("gtex:whole.blood", "gtex:artery.tibial", "tcga:tcga-coad", "tcga:tcga-brca", "scipher:scipher.complete.dataset")
all_disease_genes_results_df %>%
filter((disease == "arthritis rheumatoid") & (type_dataset %in% datasets_selected) & (threshold == threshold_selected) & (type_correlation == "moderate-strong-very strong")) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "scipher:scipher.complete.dataset", "R. arthritis")) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "gtex:whole.blood", "GTEx: Whole blood")) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "gtex:artery.tibial", "GTEx: Artery tibial")) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "tcga:tcga-coad", "TCGA: Colon cancer")) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "tcga:tcga-brca", "TCGA: Breast cancer")) %>%
group_by(size, type_dataset) %>%
mutate(mean_pval = mean(disease_lcc_pvalue)) %>%
ungroup() %>%
ggplot() +
geom_point(aes(x=size, y=abs(log10(disease_lcc_pvalue)), col=type_dataset), alpha=0.5, size=3) +
#geom_line(aes(x=size, y=abs(log10(mean_pval)), col=type_dataset), size=1) +
geom_hline(yintercept=as.numeric(abs(log10(0.05))), linetype="dashed", color="red", size=0.5) +
#scale_x_continuous(trans = scales::log10_trans()) +
scale_color_manual(values = c("#D55E00", "#E69F00", "#44AA99", "#0072B2", "#56B4E9")) +
theme_linedraw() +
xlab("Number of samples") +
#xlab("log(N)") +
ylab("Disease LCC log p-value (abs)") +
guides(col=guide_legend(title="Dataset")) +
theme(plot.title = element_text(size = 17, face="bold"),
axis.title = element_text(size = 16, face="bold"),
axis.text = element_text(size = 15),
legend.text = element_text(size = 14),
legend.title=element_text(size=15, face="bold"))
plot_file = paste(plots_dir, "disease_module_significance_by_dataset_pearson_pval_0.05.png", sep="/")
ggsave(
plot_file,
dpi = 1200,
width = 9200,
height = 6000,
units = c("px")
)